• Practice loading CSV files into Python using the CSV.reader() function in the standard library
  • Practice loading CSV files using NumPy and the numpy.loadtxt() function
  • Practice loading CSV files using Pandas and the pandas.read_csv() function

Advanced seaborn


In [1]:
# Allow Jupyter Notebook to render plots inline
%matplotlib inline

In [2]:
####### IGNORE Warning that appears when executing this cell, it is an implementation issue for the seaborn module

# Load modules
import matplotlib
import matplotlib.pyplot as plt
import os as os
import pandas as pd
import sqlalchemy as sa
import seaborn as sns

# Load classes
from time import strftime
from pandas.tools.plotting import scatter_matrix


C:\Miniconda3\envs\sandpit\lib\site-packages\IPython\html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [3]:
# create engine
engine_name = 'mssql+pyodbc'
host_name = 'au04db502nml3'
db_name = 'OneViewStaging_Telfer'
driver_name = 'SQL+Server+Native+Client+11.0'
cx = engine_name + '://' + host_name + '/' + db_name + '?driver=' + driver_name
engine = sa.create_engine(cx)

In [4]:
# define the query here to select the data wanted
sql = """
        select
            [StartDateTime]
            ,[Total Material Moved (t)]
            ,[Total ExPit Material Moved (t)]
            ,[Total ExPit Ore (t)]
            ,[Direct Feed & OP Rehandle (t)]
            ,[Direct Feed (t)]
            ,[Leach Ore (t)]
            ,[ExPit Ore (t)]
            ,[Waste (t)]
            ,[Rehandle (t)]
            ,[Relocation (t)]
            ,[TopSoil (t)]
            ,[Scats (t)]
            ,[MReef (OP) (t)]
            ,[SLC (t)]
            ,[M35 (t)] 
        from
            [dbo].[tb_OPCO_TELReconcilorData] 
        order by
            StartDateTime
        ;"""

In [5]:
# get the data from the db
df = pd.read_sql(sql, engine)

In [6]:
# We're' going to 'clean' the data of null values; the following columns have no
# data in them so we need to remove them to ensure our cleaning in the next step 
# doesn't delete all the data
cols_to_drop = ['TopSoil (t)']
df.drop(cols_to_drop, axis=1, inplace=True) 

# get rid of rows with nulls
# and add column with day of week to use for analysis
df.dropna(axis=0, how='any', inplace=True)
df['Day Of Week'] = (df['StartDateTime']).dt.dayofweek

In [7]:
# Summary stats for the results in the query
df.describe().transpose()


Out[7]:
count mean std min 25% 50% 75% max
Total Material Moved (t) 57.0 107863.561404 17303.205930 70338.0 98240.0 105685.0 116237.0 165799.0
Total ExPit Material Moved (t) 57.0 68173.631579 14781.200515 32296.0 60391.0 67918.0 76299.0 118559.0
Total ExPit Ore (t) 57.0 39850.859649 18667.368628 2928.0 25845.0 38293.0 50526.0 82386.0
Direct Feed & OP Rehandle (t) 57.0 47279.228070 12535.078728 6588.0 40709.0 48821.0 55431.0 69876.0
Direct Feed (t) 57.0 20889.385965 13338.240330 1347.0 10233.0 18091.0 29725.0 52048.0
Leach Ore (t) 57.0 8052.614035 7814.593586 441.0 2226.0 4528.0 12558.0 30588.0
ExPit Ore (t) 57.0 10908.859649 8598.550294 246.0 4337.0 8125.0 16598.0 39774.0
Waste (t) 57.0 28322.771930 18512.587420 1170.0 13493.0 26554.0 39336.0 82553.0
Rehandle (t) 57.0 25381.280702 13579.823176 1395.0 15108.0 23802.0 34491.0 52426.0
Relocation (t) 57.0 2149.719298 2530.000161 173.0 237.0 697.0 3805.0 8512.0
Scats (t) 57.0 11150.368421 2037.681610 6867.0 9560.0 11065.0 12153.0 16888.0
MReef (OP) (t) 57.0 1008.561404 462.662289 194.0 682.0 962.0 1228.0 2179.0
SLC (t) 57.0 13530.701754 4147.210276 430.0 11750.0 14320.0 16040.0 22160.0
M35 (t) 57.0 898.175439 377.152069 101.0 604.0 990.0 1198.0 1618.0
Day Of Week 57.0 3.438596 1.991209 0.0 2.0 4.0 5.0 6.0

In [17]:
# Lets view the first 5 rows of the data set
df.head()


Out[17]:
Total Material Moved (t) Total ExPit Material Moved (t) Total ExPit Ore (t) Direct Feed & OP Rehandle (t) Direct Feed (t) Leach Ore (t) ExPit Ore (t) Waste (t) Rehandle (t) Relocation (t) Scats (t) MReef (OP) (t) SLC (t) M35 (t) Day Of Week
4 120149.0 58387.0 27402.0 52557.0 2781.0 22975.0 1646.0 30985.0 49302.0 728.0 11258.0 474.0 8180.0 730.0 1
7 119014.0 69818.0 49222.0 60999.0 25233.0 15478.0 8511.0 20596.0 33778.0 237.0 13193.0 1988.0 7610.0 1077.0 4
9 103769.0 64781.0 49020.0 41465.0 13163.0 27540.0 8317.0 15761.0 27301.0 2460.0 8226.0 1001.0 15640.0 1198.0 6
10 105386.0 76665.0 75495.0 63647.0 52048.0 6878.0 16569.0 1170.0 10380.0 234.0 16888.0 1219.0 16040.0 1252.0 0
18 97442.0 81449.0 64235.0 48802.0 44552.0 2035.0 17648.0 17214.0 2511.0 588.0 11155.0 1739.0 10440.0 1196.0 1

In [9]:
# Use default plotting options to forma scatter matrix
fig1 = scatter_matrix(df)



In [10]:
# Lets format the scatter matrix differently
matplotlib.style.use('ggplot')
fig2 = scatter_matrix(df, alpha=0.2, figsize=(15, 15), diagonal='kde')



In [11]:
# Perform scatter matrix with another viz library - seaborn - using its defaults
fig3 = sns.pairplot(df, hue="Day Of Week")



In [12]:
# Lets do the same thing again removing the analysis at the "Day of Week" level
# The upper half of the matrix triangle will have regression plots for the pairs
# The diagonal will have distribution plots for the variable/feature in question
# The lower half of the triangle will have residual plots for the the pairs
fig4 = sns.PairGrid(df)  
fig4.map_upper(sns.regplot)  
fig4.map_lower(sns.residplot)  
fig4.map_diag(plt.hist)

for ax in fig4.axes.flat:  
    plt.setp(ax.get_xticklabels(), rotation=45)
    
fig4.add_legend()  
fig4.set(alpha=0.5)


Out[12]:
<seaborn.axisgrid.PairGrid at 0x304b8cf8>

In [13]:
# Lets view "Total ExPit Ore (t)" vs "Waste (t)" as there potentially
# looks to be a linear relationship
fig5 = sns.jointplot("Total ExPit Ore (t)", "Waste (t)", data=df, kind='kde')



In [14]:
# Lets view the same relationship, just styled differently
fig6 = sns.jointplot("Total ExPit Ore (t)", "Waste (t)", data=df, kind='reg')



In [15]:
# Look at the relationship through the days of the week
fig7 = sns.lmplot(x="Total ExPit Ore (t)", y="Waste (t)", data=df, col="Day Of Week", order=1);



In [16]:
# Create a box plot of the varaibles/features
date_cols_to_drop = ["Day Of Week", "StartDateTime"]
df = df.drop(date_cols_to_drop[1], axis=1)
fig8 = sns.boxplot(data=df, orient="h")